<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta charset="utf-8">
<title>Pluggable Similarity Algorithms | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="controlling-relevance.html" title="Controlling Relevance">
<link rel="prev" href="script-score.html" title="Scoring with Scripts">
<link rel="next" href="changing-similarities.html" title="Changing Similarities">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <!-- Creates the dataLayer queue; the noscript iframe is the fallback used when scripting is disabled. -->
    <script>dataLayer = [];</script><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>
      // Queues a gtm.start event on the data layer, then injects gtm.js (async)
      // before the first script element of the document. The URL is explicit
      // https rather than protocol-relative so the tag is never fetched over http.
      (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');
    </script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      // Reuse the dataLayer queue if one already exists; otherwise start an empty one.
      window.dataLayer = window.dataLayer || [];
      // Pushes each call's `arguments` object onto dataLayer as a single entry.
      // NOTE(review): presumably gtag.js expects the raw `arguments` object here,
      // so do not convert this to an arrow function or spread into an array — confirm.
      function gtag(){dataLayer.push(arguments);}
      // Queue a 'js' entry stamped with the current time, then a 'config' entry
      // for the UA-12395217-16 property.
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      // Gate that decides whether to load the intercept script, keeping state in a cookie.
      // Constructor arguments as used below: e = rate (compared against 100, so
      // apparently a percentage), h = mode ("v" or "r"), f = cookie name,
      // g = URL of the script to load (this parameter shadows the outer `g`).
      (function(){var g=function(e,h,f,g){
      // get(a): returns the value of cookie `a` from document.cookie, or null when
      // it is not present. The loop's `e` is a local that shadows the constructor's `e`.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a, c): writes cookie a=c with path=/ and an expiry 6048E5 ms (7 days) from now.
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): returns true when the script should be loaded.
      //  - Cookie value is "mode:rate:counter", split on ":".
      //  - No cookie and e is 100: returns true without writing a cookie.
      //  - No cookie and e is not 100: in "v" mode e is first replaced by 100 with
      //    probability e/100 (else 0); then "h:e:0" is stored in the cookie.
      //  - Stored rate of 100: true. Mode "v": false. Mode "r": true when
      //    counter % floor(100 / rate) is 0; the counter is incremented and the
      //    cookie rewritten on every call. Any other mode: true.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() passes, appends a script element with src = g to
      // document.body (nothing is appended if document.body is missing).
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): runs go() on the window load event, falling back to attachEvent
      // when addEventListener is unavailable.
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Instantiate with rate 100 and mode "r", then start; any exception thrown
      // here is swallowed by the empty catch.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="search-in-depth.html">Search in Depth</a></span>
»
<span class="breadcrumb-link"><a href="controlling-relevance.html">Controlling Relevance</a></span>
»
<span class="breadcrumb-node">Pluggable Similarity Algorithms</span>
</div>
<div class="navheader">
<span class="prev">
<a href="script-score.html">« Scoring with Scripts</a>
</span>
<span class="next">
<a href="changing-similarities.html">Changing Similarities »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="pluggable-similarites"></a>Pluggable Similarity Algorithms<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/170_Relevance/70_Pluggable_similarities.asciidoc">edit</a>
</h2>
</div></div></div>
<p>Before we move on from relevance and scoring, we will finish this chapter with
a more advanced subject: pluggable similarity algorithms. While Elasticsearch
uses <a class="xref" href="practical-scoring-function.html" title="Lucene’s Practical Scoring Function">Lucene’s Practical Scoring Function</a> as its default similarity algorithm,
it supports other algorithms out of the box, which are listed
in the <a href="/guide/en/elasticsearch/reference/2.4/index-modules-similarity.html#configuration" class="ulink" target="_top">Similarity Modules</a> documentation.</p>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="bm25"></a>Okapi BM25<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/170_Relevance/70_Pluggable_similarities.asciidoc">edit</a>
</h3>
</div></div></div>
<p>The most interesting competitor to TF/IDF and the vector space model is called
<a href="https://en.wikipedia.org/wiki/Okapi_BM25" class="ulink" target="_top"><em>Okapi BM25</em></a>, which is considered to
be a <em>state-of-the-art</em> ranking function. BM25 originates from the
<a href="https://en.wikipedia.org/wiki/Probabilistic_relevance_model" class="ulink" target="_top">probabilistic relevance model</a>,
rather than the vector space model, yet the algorithm has a lot in common with
Lucene’s practical scoring function.</p>
<p>Both use term frequency, inverse document frequency, and field-length
normalization, but the definition of each of these factors is a little
different.  Rather than explaining the BM25 formula in detail, we will focus
on the practical advantages that BM25 offers.</p>
<div class="section">
<div class="titlepage"><div><div>
<h4 class="title">
<a id="bm25-saturation"></a>Term-frequency saturation<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/170_Relevance/70_Pluggable_similarities.asciidoc">edit</a>
</h4>
</div></div></div>
<p>Both TF/IDF and BM25 use <a class="xref" href="scoring-theory.html#idf" title="Inverse document frequency">inverse document frequency</a> to distinguish
between common (low value) words and uncommon (high value) words.  Both also
recognize (see <a class="xref" href="scoring-theory.html#tf" title="Term frequency">Term frequency</a>) that the more often a word appears in a document, the
more likely it is that the document is relevant for that word.</p>
<p>However, common words occur commonly.  The fact that a common word appears
many times in one document is offset by the fact that the word appears many
times in <em>all</em> documents.</p>
<p>That said, TF/IDF was designed in an era when it was standard practice to
remove the <em>most</em> common words (or <em>stopwords</em>, see <a class="xref" href="stopwords.html" title="Stopwords: Performance Versus Precision"><em>Stopwords: Performance Versus Precision</em></a>) from the
index altogether. The algorithm didn’t need to worry about an upper limit for
term frequency because the most frequent terms had already been removed.</p>
<p>In Elasticsearch, the <code class="literal">standard</code> analyzer—​the default for <code class="literal">string</code> fields—​doesn’t remove stopwords because, even though they are words of little
value, they do still have some value.  The result is that, for very long
documents, the sheer number of occurrences of words like <code class="literal">the</code> and <code class="literal">and</code> can
artificially boost their weight.</p>
<p>BM25, on the other hand, does have an upper limit.  Terms that appear 5 to 10
times in a document have a significantly larger impact on relevance than terms
that appear just once or twice.  However, as can be seen in <a class="xref" href="pluggable-similarites.html#img-bm25-saturation" title="Term frequency saturation for TF/IDF and BM25">Figure 34, “Term frequency saturation for TF/IDF and BM25”</a>, terms that appear 20 times in a
document have almost the same impact as terms that appear a thousand times or
more.</p>
<p>This is known as <em>nonlinear term-frequency saturation</em>.</p>
<div id="img-bm25-saturation" class="imageblock">
<div class="content">
<img src="images/elas_1706.png" alt="Term frequency saturation for TF/IDF and BM25">
</div>
<div class="title">Figure 34. Term frequency saturation for TF/IDF and BM25</div>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h4 class="title">
<a id="bm25-normalization"></a>Field-length normalization<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/170_Relevance/70_Pluggable_similarities.asciidoc">edit</a>
</h4>
</div></div></div>
<p>In <a class="xref" href="scoring-theory.html#field-norm" title="Field-length norm">Field-length norm</a>, we said that Lucene considers shorter fields to have
more weight than longer fields: the frequency of a term in a field is offset
by the length of the field.  However, the practical scoring function treats
all fields in the same way.  It will treat all <code class="literal">title</code> fields (because they
are short) as more important than all <code class="literal">body</code> fields (because they are long).</p>
<p>BM25 also considers shorter fields to have more weight than longer fields, but
it considers each field separately by taking the average length of the field
into account. It can distinguish between a short <code class="literal">title</code> field and a long
<code class="literal">title</code> field.</p>
<div class="caution admon">
<div class="icon"></div>
<div class="admon_content">
<p>In <a class="xref" href="query-time-boosting.html" title="Query-Time Boosting">Query-Time Boosting</a>, we said that the <code class="literal">title</code> field has a
<em>natural</em> boost over the <code class="literal">body</code> field because of its length.  This natural
boost disappears with BM25 as differences in field length apply only within a
single field.</p>
</div>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h4 class="title">
<a id="bm25-tunability"></a>Tuning BM25<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/170_Relevance/70_Pluggable_similarities.asciidoc">edit</a>
</h4>
</div></div></div>
<p>One of the nice features of BM25 is that, unlike TF/IDF, it has two parameters
that allow it to be tuned:</p>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">k1</code>
</span>
</dt>
<dd>
This parameter controls how quickly an increase in term frequency results
in term-frequency saturation.  The default value is <code class="literal">1.2</code>. Lower values
result in quicker saturation, and higher values in slower saturation.
</dd>
<dt>
<span class="term">
<code class="literal">b</code>
</span>
</dt>
<dd>
This parameter controls how much effect field-length normalization should
have. A value of <code class="literal">0.0</code> disables normalization completely, and a value of
<code class="literal">1.0</code> normalizes fully. The default is <code class="literal">0.75</code>.
</dd>
</dl>
</div>
<p>The practicalities of tuning BM25 are another matter. The default values for
<code class="literal">k1</code> and <code class="literal">b</code> should be suitable for most document collections, but the
optimal values really depend on the collection.  Finding good values for your
collection is a matter of adjusting, checking, and adjusting again.</p>
</div>

</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="script-score.html">« Scoring with Scripts</a>
</span>
<span class="next">
<a href="changing-similarities.html">Changing Similarities »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  window.initial_state = {}</script>
  </body>
</html>
